packages = c("readr", "dplyr", "jiebaR", "tidyr", "tidytext", "igraph", "topicmodels", "ggplot2", "stringr")
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)
library(readr)
library(dplyr)
library(jiebaR)
library(tidyr)
library(tidytext)
library(igraph)
library(topicmodels)
library(stringr)
library(ggplot2)
library(tibble)
library(ggrepel)
# 文章資料
posts <- read_csv("./12_content.csv")
posts
# 回覆資料
reviews <- read_csv("./12_comment.csv")
reviews
# 選取需要的欄位
reviews <- reviews %>%
select(canonical_url, commenter, reaction, text)
reviews
posts$date <- as.Date(posts$date)
posts %>%
group_by(date) %>%
summarise(count = n()) %>%
ggplot(aes(date,count)) +
geom_line(color="blue", size=1) +
theme_classic()
length(unique(posts$poster))
[1] 3908
length(unique(reviews$commenter))
[1] 18830
allPoster <- c(posts$poster, reviews$commenter)
length(unique(allPoster))
[1] 20747
# 整理所有出現過得使用者
# 如果它曾發過文的話就標註他爲poster
# 如果沒有發過文的話則標註他爲replyer
userList <- data.frame(user=unique(allPoster)) %>%
mutate(type=ifelse(user%in%posts$poster, "poster", "replyer"))
userList["user"]<-apply(userList["user"], 1:length(userList["user"]), function(x) gsub(" .*","", x))
userList
# 把原文與回覆依據url innerJoin起來,這邊直接讀之前join的檔案
posts_Reviews <- read_csv("./post_review.csv")
#posts_Reviews <- merge(x = posts, y = reviews, by = c("canonical_url"))
posts_Reviews
# 取出 commenter(回覆者)、poster(發文者)、canonical_url(文章連結) 、title.x 四個欄位
link <- posts_Reviews %>%
select(commenter,poster, canonical_url, title.x)
link
reviewNetwork <- graph_from_data_frame(d=link, directed=F)
reviewNetwork
IGRAPH 3601ea4 UN-- 19263 2201825 --
+ attr: name (v/c), canonical_url (e/c), title.x (e/c)
+ edges from 3601ea4 (vertex names):
[1] want150 --oftisa ulycess --oftisa want150 --oftisa
[4] DarkKnight--oftisa sentaifans--radiohead56 Marchosias--radiohead56
[7] bluecup --radiohead56 sentaifans--radiohead56 sentaifans--radiohead56
[10] sentaifans--sentaifans sentaifans--GameGyu sentaifans--sentaifans
[13] sentaifans--GameGyu sentaifans--Cosmoswalker sentaifans--hwyi
[16] sentaifans--GameGyu sentaifans--fdtu0928 sentaifans--GameGyu
[19] sentaifans--GameGyu sentaifans--sentaifans sentaifans--sentaifans
[22] sentaifans--impixels sentaifans--GameGyu sentaifans--bluecup
+ ... omitted several edges
# 調整點點的大小和線的粗細,並不顯示使用者賬號。
# 點太多邊太密,必須要篩選資料,這邊就先不畫圖,反正也看不出什麼
set.seed(487)
labels <- degree(reviewNetwork)
V(reviewNetwork)$label <- names(labels)
V(reviewNetwork)$color <- ifelse(V(reviewNetwork)$type=="poster", "gold", "blue")
#plot(reviewNetwork, vertex.size=2, edge.arrow.size=.2, vertex.label = NA)
link <- posts_Reviews[posts_Reviews$date.x == as.Date("2019-12-04"), ]
link["poster"]<-apply(link["poster"], 1:length(link["poster"]), function(x) gsub(" .*","", x))
link["commenter"]<-apply(link["commenter"], 1:length(link["commenter"]), function(x) gsub(" .*","", x))
link <- select(link, commenter, poster, canonical_url) %>% unique()
link
# 這邊要篩選link中有出現的使用者
# 因爲如果userList(igraph中graph_from_data_frame的v參數吃的那個東西)中出現了沒有在link中出現的使用者也會被igraph畫上去,圖片就會變得沒有意義
filtered_user <- userList %>%
filter(user%in%link$commenter | user%in%link$poster) %>%
arrange(desc(type)) %>% unique()
filtered_user
# 為了觀察方便及找出活躍鄉民,先移除互動量小於200的用戶
reviewNetwork <- graph_from_data_frame(d=link, v=filtered_user, directed=T)
reviewNetwork <- delete.vertices(reviewNetwork, V(reviewNetwork)[ degree(reviewNetwork) < 200])
reviewNetwork
IGRAPH 59a7dc4 DN-B 8 32 --
+ attr: name (v/c), type (v/c), canonical_url (e/c)
+ edges from 59a7dc4 (vertex names):
[1] kero2377 ->cutbear123 buoyant0828 ->cutbear123 kero2377 ->kero2377
[4] kero2377 ->kero2377 Sinreigensou->kero2377 howiekuohr ->kero2377
[7] kero2377 ->kero2377 bankingpaul ->howiekuohr buoyant0828 ->bankingpaul
[10] buoyant0828 ->jacklyl kero2377 ->kero2377 buoyant0828 ->kero2377
[13] howiekuohr ->jacklyl kero2377 ->jacklyl howiekuohr ->kero2377
[16] cutbear123 ->kero2377 cutbear123 ->jacklyl howiekuohr ->jacklyl
[19] kero2377 ->kero2377 howiekuohr ->kero2377 howiekuohr ->kero2377
[22] cutbear123 ->kero2377 kero2377 ->howiekuohr Sinreigensou->Sinreigensou
+ ... omitted several edges
# 繪圖
set.seed(487)
labels <- degree(reviewNetwork)
V(reviewNetwork)$label <- names(labels)
V(reviewNetwork)$color <- ifelse(V(reviewNetwork)$type=="poster", "gold", "blue")
plot(reviewNetwork, vertex.size=8, edge.arrow.size=.4, vertex.label=V(reviewNetwork)$label, vertex.label.font=2)
legend("bottomright", c("author","reviewer"), pch=21,
col="#777777", pt.bg=c("gold","blue"), pt.cex=1, cex=1)
我們可以看到基本的使用者關係,但是我們希望能夠將更進階的資訊視覺化。
例如:使用者經常參與的文章種類,或是使用者在該社群網路中是否受到歡迎。
# 文章斷句
ptt_meta <- posts %>%
mutate(sentence=gsub("[\n]{2,}", "。", text))
# 以全形或半形 驚歎號、問號、分號 以及 全形句號 爲依據進行斷句
ptt_sentences <- strsplit(ptt_meta$sentence,"[。!;?!?;]")
# 將每句句子,與他所屬的文章連結配對起來,整理成一個dataframe
ptt_sentences <- data.frame(
artUrl = rep(ptt_meta$canonical_url, sapply(ptt_sentences, length)),
sentence = unlist(ptt_sentences)
) %>%
filter(!str_detect(sentence, regex("^(\t|\n| )*$")))
ptt_sentences$sentence <- as.character(ptt_sentences$sentence)
ptt_sentences
Read 44 items
Read 1225 items
[1] TRUE
## 清理斷詞結果
# 挑出總出現次數大於3的字
reserved_word <- tokens %>%
group_by(word) %>%
count() %>%
filter(n > 3) %>%
unlist()
ptt_removed <- tokens %>%
filter(word %in% reserved_word)
ptt_dtm <- ptt_removed %>% cast_dtm(artUrl, word, count)
ptt_dtm
<<DocumentTermMatrix (documents: 19991, terms: 9623)>>
Non-/sparse entries: 410782/191962611
Sparsity : 100%
Maximal term length: 42
Weighting : term frequency (tf)
# LDA分主題
rowTotals <- apply(ptt_dtm , 1, sum) #Find the sum of words in each Document
ptt_dtm <- ptt_dtm[rowTotals> 0, ]
ptt_lda <- LDA(ptt_dtm, k = 6, control = list(seed = 1000))
# 看各群的常用詞彙
tidy(ptt_lda, matrix = "beta") %>%
group_by(topic) %>%
top_n(10, beta) %>%
ungroup() %>%
mutate(topic = as.factor(topic),
term = reorder_within(term, beta, topic)) %>%
ggplot(aes(term, beta, fill = topic)) +
geom_col(show.legend = FALSE) +
theme(text = element_text(family = 'TW-Kai')) +
facet_wrap(~ topic, scales = "free") +
coord_flip() +
scale_x_reordered()
# 使用LDA預測每篇文章的主題
# 在tidy function中使用參數"gamma"來取得 theta矩陣。
ptt_topics <- tidy(ptt_lda, matrix="gamma") %>%
group_by(document) %>%
top_n(1, wt=gamma)
ptt_topics
# 把文章資訊和主題join起來
posts_Reviews <- merge(x = posts_Reviews, y = ptt_topics, by.x = "canonical_url", by.y="document")
posts_Reviews
# 挑選出2019/12的文章,
# 篩選有在15篇以上文章回覆者,
# 欄位只取:commenter(評論者), poster(發文者), canonical_url(文章連結), title.x(主題), reaction(推噓)
link <- posts_Reviews %>%
filter(reaction !="→") %>%
group_by(commenter, canonical_url) %>%
filter(n()>15) %>%
ungroup() %>%
filter(topic == 1 | topic == 6) %>%
select(commenter, poster, canonical_url, title.x, reaction) %>%
unique()
link["poster"]<-apply(link["poster"], 1:length(link["poster"]), function(x) gsub(" .*","", x))
link["commenter"]<-apply(link["commenter"], 1:length(link["commenter"]), function(x) gsub(" .*","", x))
link <- unique(link)
link
# 篩選link中有出現的使用者
filtered_user <- userList %>%
filter(user%in%link$commenter | user%in%link$poster) %>%
arrange(desc(type)) %>% unique()
filtered_user["user"] <- apply(filtered_user["user"], 1:length(filtered_user["user"]), function(x) gsub(" .*","", x))
filtered_user <- unique(filtered_user)
filtered_user
# 建立網路關係
reviewNetwork <- graph_from_data_frame(d=link, v=filtered_user, directed=T)
# 刪除degree < 10 的用戶
reviewNetwork <- delete.vertices(reviewNetwork, V(reviewNetwork)[ degree(reviewNetwork) < 10])
# 依據使用者身份對點進行上色
labels <- degree(reviewNetwork)
V(reviewNetwork)$label <- names(labels)
V(reviewNetwork)$color <- ifelse(V(reviewNetwork)$type=="poster", "gold", "lightblue")
# 依據使用者反應對邊進行上色
E(reviewNetwork)$color <- ifelse(E(reviewNetwork)$reaction == "推", "lightgreen", "palevioletred")
# 畫出社群網路圖
set.seed(5431)
plot(reviewNetwork, vertex.size=5, edge.arrow.size=.2, edge.width=.4,
vertex.label= NA, vertex.label.font=2)
# 加入標示
legend("bottomright", c("author","reviewer"), pch=21,
col="#777777", pt.bg=c("gold","lightblue"), pt.cex=1, cex=1)
legend("topleft", c("Like","Boo"),
col=c("lightgreen","palevioletred"), lty=1, cex=1)
# 畫出社群網路圖,同上,只是有label
set.seed(5431)
plot(reviewNetwork, vertex.size=5, edge.arrow.size=.2, edge.width=.4,
vertex.label= ifelse(degree(reviewNetwork) > 20, V(reviewNetwork)$label, NA), vertex.label.font=2)
# 加入標示
legend("bottomright", c("author","reviewer"), pch=21,
col="#777777", pt.bg=c("gold","lightblue"), pt.cex=1, cex=1)
legend("topleft", c("Like","Boo"),
col=c("lightgreen","palevioletred"), lty=1, cex=1)